{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json \n",
    "import pandas\n",
    "import os.path\n",
    "from numpy import nan\n",
    "from enum import Enum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Levels of alcohol service"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class __Alcohol(Enum):\n",
    "    none = 0\n",
    "    beer_and_wine = 1\n",
    "    full_bar = 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Return clean and organised data for Arizona restaurants (cached as CSV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def getData(fromCache=True, dataDirectory='../../data/yelp/'):\n",
    "    \"\"\"Return the cleaned table of Arizona restaurants from the Yelp business data.\n",
    "\n",
    "    The table is cached as 'restaurants_clean.csv' inside dataDirectory. When\n",
    "    fromCache is true and that file exists it is read back as-is; otherwise the\n",
    "    table is rebuilt from 'yelp_academic_dataset_business.json' (one JSON\n",
    "    document per line) and the cache file is (re)written.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fromCache : bool, default True\n",
    "        Reuse the cached CSV when it exists.\n",
    "    dataDirectory : str, default '../../data/yelp/'\n",
    "        Directory holding the raw JSON file and the cached CSV.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per business in state 'AZ' whose categories include\n",
    "        'Restaurants', with the recoded columns beer_wine, full_bar,\n",
    "        price_range, attire, takeout, waiter_service, outdoor_seating and\n",
    "        uniqueness (1 / number of rows sharing the same name). Missing\n",
    "        numeric values are replaced by the column mean.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If a record carries an 'Alcohol' label that is not a member of __Alcohol.\n",
    "    \"\"\"\n",
    "    # Retrieved 09-07-2016 from https://nz.yelp.com/dataset_challenge/dataset\n",
    "    cachePath = os.path.join(dataDirectory, 'restaurants_clean.csv')\n",
    "    rawPath = os.path.join(dataDirectory, 'yelp_academic_dataset_business.json')\n",
    "\n",
    "    # logical `and` rather than bitwise `&`: any truthy flag works and the\n",
    "    # file system is only consulted when the cache is actually wanted\n",
    "    if fromCache and os.path.isfile(cachePath):\n",
    "        return pandas.read_csv(cachePath, header=0)\n",
    "\n",
    "    # read-in .json file as pandas dataframe; `with` closes the file handle\n",
    "    with open(rawPath, 'r', encoding='utf-8') as rawFile:\n",
    "        restaurants = pandas.DataFrame(\n",
    "            [json.loads(line) for line in rawFile if line.strip()]\n",
    "        )\n",
    "\n",
    "    # select study area data & filter restaurants\n",
    "    inStudyArea = restaurants['state'] == 'AZ'\n",
    "    isRestaurant = restaurants['categories'].apply(\n",
    "        lambda categories: 'Restaurants' in categories\n",
    "    )\n",
    "    restaurants = restaurants[inStudyArea & isRestaurant].drop(\n",
    "        ['neighborhoods', 'open', 'city', 'state', 'type', 'review_count', 'stars'],\n",
    "        axis=1\n",
    "    )\n",
    "\n",
    "    # recode nested variables; applying over the attributes column alone\n",
    "    # avoids building a full row object for every record\n",
    "    attributes = restaurants['attributes']\n",
    "\n",
    "    def servesAlcohol(level):\n",
    "        # 1 where the record's 'Alcohol' label (default 'none') maps to level\n",
    "        return attributes.apply(\n",
    "            lambda attrs: __Alcohol[attrs.get('Alcohol', 'none')] == level\n",
    "        ).astype(int)\n",
    "\n",
    "    def hasFeature(key):\n",
    "        # 1 where the attribute is truthy, 0 where it is false or absent\n",
    "        return attributes.apply(lambda attrs: attrs.get(key, False)).astype(int)\n",
    "\n",
    "    restaurants['beer_wine'] = servesAlcohol(__Alcohol['beer_and_wine'])\n",
    "    restaurants['full_bar'] = servesAlcohol(__Alcohol['full_bar'])\n",
    "    restaurants['price_range'] = attributes.apply(\n",
    "        lambda attrs: attrs.get('Price Range', nan)\n",
    "    )\n",
    "    restaurants['attire'] = attributes.apply(\n",
    "        lambda attrs: attrs.get('Attire', 'casual')\n",
    "    ).astype('category').cat.codes\n",
    "    restaurants['takeout'] = hasFeature('Take-out')\n",
    "    restaurants['waiter_service'] = hasFeature('Waiter Service')\n",
    "    restaurants['outdoor_seating'] = hasFeature('Outdoor Seating')\n",
    "\n",
    "    restaurants = restaurants.drop(['attributes', 'categories', 'hours'], axis=1)\n",
    "\n",
    "    # calculate additional variables\n",
    "    nameCounts = restaurants.groupby('name').size()\n",
    "    restaurants['uniqueness'] = 1 / restaurants['name'].map(nameCounts)\n",
    "\n",
    "    # remove NAs: numeric_only keeps non-numeric columns such as name out of\n",
    "    # the mean, which newer pandas versions refuse to average\n",
    "    restaurants = restaurants.fillna(restaurants.mean(numeric_only=True))\n",
    "\n",
    "    # cache results and return\n",
    "    restaurants.to_csv(cachePath, index=False)\n",
    "    return restaurants"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
